{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "67c1e6b1",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "下面的例子将展示词向量标准工具包——gensim提供的词嵌入，并展示词嵌入如何表示词的相似度。\n",
    "<!-- https://nlp.stanford.edu/projects/glove/ -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5c5a740a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pprint\n",
    "\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "# 从GloVe官网下载GloVe向量，此处使用的是glove.6B.zip\n",
    "# 解压缩zip文件并将以下路径改为解压后对应文件的路径\n",
    "model = KeyedVectors.load_word2vec_format('F:/vscode/社会舆情/glove.6B.100d.txt', binary=False, no_header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2c55d0cc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "anyio 3.5.0 has requirement idna>=2.8, but you have idna 2.5.\n",
      "mysql-connector-python 8.2.0 has requirement protobuf<=4.21.12,>=4.21.1, but you have protobuf 3.20.3.\n",
      "numba 0.57.0 has requirement numpy<1.25,>=1.21, but you have numpy 1.26.4.\n",
      "python-lsp-black 1.2.1 has requirement black>=22.3.0, but you have black 0.0.\n",
      "tables 3.8.0 has requirement blosc2~=2.0.0, but you have blosc2 3.3.2.\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "pip check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "11aaab18",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: gensim in d:\\py\\lib\\site-packages (4.3.0)\n",
      "Requirement already satisfied: numpy>=1.18.5 in d:\\py\\lib\\site-packages (from gensim) (1.26.4)\n",
      "Requirement already satisfied: scipy>=1.7.0 in d:\\py\\lib\\site-packages (from gensim) (1.10.1)\n",
      "Requirement already satisfied: smart-open>=1.8.1 in d:\\py\\lib\\site-packages (from gensim) (5.2.1)\n",
      "Collecting FuzzyTM>=0.4.0 (from gensim)\n",
      "  Obtaining dependency information for FuzzyTM>=0.4.0 from https://files.pythonhosted.org/packages/2d/30/074bac7a25866a2807c1005c7852c0139ac22ba837871fc01f16df29b9dc/FuzzyTM-2.0.9-py3-none-any.whl.metadata\n",
      "  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)\n",
      "Requirement already satisfied: pandas in d:\\py\\lib\\site-packages (from FuzzyTM>=0.4.0->gensim) (1.5.3)\n",
      "Collecting pyfume (from FuzzyTM>=0.4.0->gensim)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/ed/ea/a3b120e251145dcdb10777f2bc5f18b1496fd999d705a178c1b0ad947ce1/pyFUME-0.3.4-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.3.4-py3-none-any.whl.metadata (9.7 kB)\n",
      "Requirement already satisfied: python-dateutil>=2.8.1 in d:\\py\\lib\\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in d:\\py\\lib\\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2022.7)\n",
      "Collecting numpy>=1.18.5 (from gensim)\n",
      "  Obtaining dependency information for numpy>=1.18.5 from https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl.metadata\n",
      "  Downloading numpy-1.24.4-cp311-cp311-win_amd64.whl.metadata (5.6 kB)\n",
      "Collecting simpful==2.12.0 (from pyfume->FuzzyTM>=0.4.0->gensim)\n",
      "  Obtaining dependency information for simpful==2.12.0 from https://files.pythonhosted.org/packages/9d/0e/aebc2fb0b0f481994179b2ee2b8e6bbf0894d971594688c018375e7076ea/simpful-2.12.0-py3-none-any.whl.metadata\n",
      "  Downloading simpful-2.12.0-py3-none-any.whl.metadata (4.8 kB)\n",
      "INFO: pip is looking at multiple versions of pyfume to determine which version is compatible with other requirements. This could take a while.\n",
      "Collecting pyfume (from FuzzyTM>=0.4.0->gensim)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/f0/fe/b899a3d9a18c9a44a35155c79a4c152cb85990ea38ce6ab7ed73e5caa1b9/pyFUME-0.3.1-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.3.1-py3-none-any.whl.metadata (9.7 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/23/40/4b3ef6f6da7bcc35ae6936cf36afab1f6fbe417b95b1e0015ede54b5365b/pyFUME-0.3.0-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.3.0-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/0f/81/8f35156b9af9c9585de8518b354e42bcaf3ee3e5a77e530613c37de22b1e/pyFUME-0.2.25-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.25-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/b9/a1/ccaed0f04d6ad05e06f906873327188749698f460acbfa7ad5de15795421/pyFUME-0.2.24-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.24-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/73/eb/f1e5a7e389a268c85f375807348d457bc309e586ab4e146966d3f163a10d/pyFUME-0.2.23-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.23-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/f8/10/3459cd7cc9d803ef6013aa332c4c46cbcb65b0b9376f4e6f24b35b559209/pyFUME-0.2.22-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.22-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/2e/9b/30cbbf372b56ec54101ff29705a32d7824f542c0e8f4a70de32550be0d61/pyFUME-0.2.20-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.20-py3-none-any.whl.metadata (9.6 kB)\n",
      "INFO: pip is still looking at multiple versions of pyfume to determine which version is compatible with other requirements. This could take a while.\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/65/37/f470b796cecceccb42e6ca440591b3eaac4d08193f1f47c4f0a9f4e9da22/pyFUME-0.2.19-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.19-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/e7/aa/5ccb542159afc5c4245079b799f634f02e87794326b18e7a0ba4eb66ecf5/pyFUME-0.2.18-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.18-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/7a/98/59f4e466b69f823120461f392c97a4179dd2617aa4911526ccc62d55c686/pyFUME-0.2.17-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.17-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/ea/0c/6a48798c650834c4737046693f5361d3b0fe6a394fff1bc9351b66797a10/pyFUME-0.2.15-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.15-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/4c/6a/0a3245373813b95562dab5e5ea7e94b938e1ba3e4899997566c5a1280cc1/pyFUME-0.2.14-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.14-py3-none-any.whl.metadata (9.6 kB)\n",
      "INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/d1/91/08c7b40b52c4af8ad60509aaaf415588ca66532685290adb54812d8d8bee/pyFUME-0.2.13-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.13-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/76/d3/5488998fd3695cb5ba953e2607ee67f930342970a205360b5dd1f046964b/pyFUME-0.2.12-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.12-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/0a/53/0e79a01b47ff3bdeeb83d5d52abc863152f3b2a1e3774cc6e46e496b4a49/pyFUME-0.2.11-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.11-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/6f/81/57ac04fc77fcc9cb77510e38630cedb2caa62c198d5a1b79628dbc6577d0/pyFUME-0.2.10-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.10-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/2c/53/3c62c3366fff954600477491cc2d4c6ebd9a117b64cae7b611663b06b821/pyFUME-0.2.9-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.9-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/5a/1d/e48e363f51ce4b738500af0ae8c9e6b43304ae49d183fafab902f2b3cb5d/pyFUME-0.2.8-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.8-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/cc/b7/e573ce5c63cdaae9ef60e5fb67947d5f8c455ec42d19bbfbea867ba19067/pyFUME-0.2.7-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.7-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/f6/66/ca4317323b9476277e899af7e54fb5af57e12cc53d43c311d16327cee1b9/pyFUME-0.2.6-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.6-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/7d/a1/1ccb8dbcbe30c309a1e3623e4019b99d220592cb0afa85b53ee8beb9f167/pyFUME-0.2.5-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.5-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/b3/3a/b7943afe9c4c2abbc87ecfe050df048572b9ec6e65991714f94d9985964c/pyFUME-0.2.4-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.4-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/b6/83/1cdf0575a720b86ca10e25646d8f5aabf84b07775a65d0b2f7c797797609/pyFUME-0.2.3-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.3-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/91/88/6c989736bd890919c2aaf44cd503008bf301e7db5167e3dc5ec120460fd1/pyFUME-0.2.2-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.2-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/9a/5a/959a9cc706a4b00268e7a12516a19eb64fe2d1162456d0a78a6a7fb0e594/pyFUME-0.2.1-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.1-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/79/d8/232988cebfe36c7d182b69ceb4db8ba4471267f11962e1dd39258e2d3eb4/pyFUME-0.2.0-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.2.0-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/84/b4/89df8b127041c9bfb57d5cf369d8d11bdcb24a42fdc3db447b822334a1e2/pyFUME-0.1.29-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.29-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/2c/30/89369148677bef275fe81d37523ccf3f7e23c126bdc5a418a83a4791ffd6/pyFUME-0.1.28-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.28-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/5b/eb/ef64578c56cea6b29aceaf53ddec6336e9429cbc6a2aa5ca8c154f3e1571/pyFUME-0.1.27-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.27-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/f4/fe/80caef99caa8be8c17f8865dd0925c04fbec82388b70accf9a72b8ea7ad3/pyFUME-0.1.26-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.26-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/2a/10/e33398b2849210dbfc1260a8d425cd305301bbd8419f35dbb177e69c54ba/pyFUME-0.1.25-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.25-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/47/46/ab1cd0716ca24ec1ed96795959834c460df2ccde41da3c030a78240c9cec/pyFUME-0.1.24-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.24-py3-none-any.whl.metadata (9.6 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/30/d3/cf692bc500b80e2ad93e6e3bbcd38bb266f6a40b40b0fe52e399491d9d12/pyFUME-0.1.23-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.23-py3-none-any.whl.metadata (6.7 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/10/c7/b5d21914839bc715c68b9640f4b68b55d96806fc103c2ecff6272104a9f3/pyFUME-0.1.22-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.22-py3-none-any.whl.metadata (6.7 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/4e/02/238c01c7e97b276286d817d3766d9349823da87d66db8cfd2fe2cd1492a5/pyFUME-0.1.21-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.21-py3-none-any.whl.metadata (6.7 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/a1/2a/7bb52216e0bcfa25f88afc37a872f97fe85ecfb014ad281b5b2ffb565ff7/pyFUME-0.1.20-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.20-py3-none-any.whl.metadata (6.5 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/65/33/197d0e9ca7d41adfd994fa735db39d487be1d5ad3bfb16fbe01b60febbf5/pyFUME-0.1.19-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.19-py3-none-any.whl.metadata (6.5 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/98/0d/66c5c68bd468ab431a39cef2a97348fdd50c3582e13e406a8c4c85ebfdbc/pyFUME-0.1.18-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.18-py3-none-any.whl.metadata (6.5 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/4a/65/0c70a37d9eda6c6faa1cc59289b3b45296eb7292a55707b6a15ef05a1bb0/pyFUME-0.1.17-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.17-py3-none-any.whl.metadata (6.5 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/bb/94/7b769ecb9bc45e8ad3dbf0dc627ef38df1ffb9d9fb0c626258345a01074e/pyFUME-0.1.16-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.16-py3-none-any.whl.metadata (6.5 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/b6/02/8bb33ce42871e45aadcb8fea53d3aaa15f1b8a84d130ed7a970b6cb647d1/pyFUME-0.1.15-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.15-py3-none-any.whl.metadata (6.5 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/7d/99/6acd1186535489406c3e7ab63b53af65a5272cd4768274596d3ba77af85b/pyFUME-0.1.14-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.14-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/72/78/2f871ff89ab8c508cb0e5b9fc702362e1e16be08a031907bf19ff6ad96ba/pyFUME-0.1.13-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.13-py3-none-any.whl.metadata (6.4 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/b6/3f/29e9859d29885242cd3ef376a2cd8dfe62212942e83a12563814ef73ff0a/pyFUME-0.1.12-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.12-py3-none-any.whl.metadata (6.4 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/8a/2f/39f5f3395e41968e314af771ba0b3785f9cf8b7e1d658a8cc39f451c7fba/pyFUME-0.1.11-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.11-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/58/9f/7dfe096d1486ffe0ae203d0b6a0e754c85ba1a7ccf56c59c08b708d50dc8/pyFUME-0.1.10-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.10-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/e1/8c/c34d70d06f768d64ff01957172280e1a2dcb5d8ea596272d92e2694856b5/pyFUME-0.1.9-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.9-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/e2/1f/de1bf01bd5d7e1858d7b2843e7cad6cf3a813e586361dc6d9d2fc6316aae/pyFUME-0.1.8-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.8-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/39/a6/c76fbedc32bf4de22a8e43299b66aa4570929244be2d29a22cc96690fa68/pyFUME-0.1.7-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.7-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/e6/c3/1ebd8894872364a6a08eed9f6a68b173e8e8c1b2c8929b76216a49bb1cdb/pyFUME-0.1.6-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.6-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/04/04/fa2c8c289cbf3c836e6ef529c8d7ea4a7f7b43db28ee3e8bd1d95fd2a917/pyFUME-0.1.5-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.5-py3-none-any.whl.metadata (6.3 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/19/33/21dbf1d0ec421cc7d99d116090620d47a636e0f8bfa4498405d6842d197c/pyFUME-0.1.4-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.4-py3-none-any.whl.metadata (4.7 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/27/ec/4e90c60bf6824fcac35e9a940d69a497163ed828338d4eed1ba2b967a0ea/pyFUME-0.1.3-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.3-py3-none-any.whl.metadata (4.7 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/ed/a9/f665d55c90c8d006ce65523422f7022d27ef9a31384458b38db0781aefae/pyFUME-0.1.2-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.1.2-py3-none-any.whl.metadata (4.7 kB)\n",
      "  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/00/eb/5f7deabda267e738cbd99fe3023d0555dd3afdd2e11a19f9f1688d1d8fe3/pyFUME-0.0.1-py3-none-any.whl.metadata\n",
      "  Downloading pyFUME-0.0.1-py3-none-any.whl.metadata (1.0 kB)\n",
      "Requirement already satisfied: six>=1.5 in d:\\py\\lib\\site-packages (from python-dateutil>=2.8.1->pandas->FuzzyTM>=0.4.0->gensim) (1.16.0)\n",
      "Downloading FuzzyTM-2.0.9-py3-none-any.whl (31 kB)\n",
      "Downloading pyFUME-0.0.1-py3-none-any.whl (13 kB)\n",
      "Installing collected packages: pyfume, FuzzyTM\n",
      "Successfully installed FuzzyTM-2.0.9 pyfume-0.0.1\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "pip install gensim --only-binary=:all:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "01a2e4a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('movie', 0.9055121541023254),\n",
      " ('films', 0.8914433717727661),\n",
      " ('directed', 0.8124362826347351),\n",
      " ('documentary', 0.8075793981552124),\n",
      " ('drama', 0.7929168939590454),\n",
      " ('movies', 0.7889865040779114),\n",
      " ('comedy', 0.7842751145362854),\n",
      " ('starring', 0.7573285102844238),\n",
      " ('cinema', 0.7419455647468567),\n",
      " ('hollywood', 0.7307389974594116)]\n",
      "[('vehicle', 0.8630837798118591),\n",
      " ('truck', 0.8597878813743591),\n",
      " ('cars', 0.837166965007782),\n",
      " ('driver', 0.8185911178588867),\n",
      " ('driving', 0.781263530254364),\n",
      " ('motorcycle', 0.7553156614303589),\n",
      " ('vehicles', 0.7462257146835327),\n",
      " ('parked', 0.74594646692276),\n",
      " ('bus', 0.737270712852478),\n",
      " ('taxi', 0.7155269384384155)]\n"
     ]
    }
   ],
   "source": [
    "# 使用most_similar()找到词表中距离给定词最近（最相似）的n个词\n",
    "pprint.pprint(model.most_similar('film'))\n",
    "pprint.pprint(model.most_similar('car'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8b62f7ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "japanese\n",
      "panda\n",
      "longest\n",
      "terrible\n",
      "queen\n"
     ]
    }
   ],
   "source": [
    "# 利用GloVe展示一个类比的例子\n",
    "def analogy(x1, x2, y1):\n",
    "    # 寻找top-N最相似的词。\n",
    "    result = model.most_similar(positive=[y1, x2], negative=[x1])\n",
    "    return result[0][0]\n",
    "\n",
    "print(analogy('china', 'chinese', 'japan'))\n",
    "print(analogy('australia', 'koala', 'china'))\n",
    "print(analogy('tall', 'tallest', 'long'))\n",
    "print(analogy('good', 'fantastic', 'bad'))\n",
    "print(analogy('man', 'woman', 'king'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c308cee",
   "metadata": {},
   "source": [
    "下面将展示word2vec的代码，包括文本预处理、skipgram算法的实现、以及使用PyTorch进行优化。这里使用《小王子》这本书作为训练语料。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "590fc408",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to C:\\Users\\Mr\n",
      "[nltk_data]     Jige\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Unzipping tokenizers\\punkt.zip.\n"
     ]
    }
   ],
   "source": [
    "# 安装NLTK，使用如下代码下载punkt组件\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "\n",
    "# 使用类管理数据对象，包括文本读取、文本预处理等\n",
    "class TheLittlePrinceDataset:\n",
    "    def __init__(self, tokenize=True):\n",
    "        # 利用NLTK函数进行分句和分词\n",
    "        text = open('F:/vscode/社会舆情/the little prince.txt', 'r', encoding='utf-8').read()\n",
    "        if tokenize:\n",
    "            self.sentences = sent_tokenize(text.lower())\n",
    "            self.tokens = [word_tokenize(sent) for sent in self.sentences]\n",
    "        else:\n",
    "            self.text = text\n",
    "\n",
    "    def build_vocab(self, min_freq=1):\n",
    "        # 统计词频\n",
    "        frequency = defaultdict(int)\n",
    "        for sentence in self.tokens:\n",
    "            for token in sentence:\n",
    "                frequency[token] += 1\n",
    "        self.frequency = frequency\n",
    "\n",
    "        # 加入<unk>处理未登录词，加入<pad>用于对齐变长输入进而加速\n",
    "        self.token2id = {'<unk>': 1, '<pad>': 0}\n",
    "        self.id2token = {1: '<unk>', 0: '<pad>'}\n",
    "        for token, freq in sorted(frequency.items(), key=lambda x: -x[1]):\n",
    "            # 丢弃低频词\n",
    "            if freq > min_freq:\n",
    "                self.token2id[token] = len(self.token2id)\n",
    "                self.id2token[len(self.id2token)] = token\n",
    "            else:\n",
    "                break\n",
    "\n",
    "    def get_word_distribution(self):\n",
    "        distribution = np.zeros(vocab_size)\n",
    "        for token, freq in self.frequency.items():\n",
    "            if token in dataset.token2id:\n",
    "                distribution[dataset.token2id[token]] = freq\n",
    "            else:\n",
    "                # 不在词表中的词按<unk>计算\n",
    "                distribution[1] += freq\n",
    "        distribution /= distribution.sum()\n",
    "        return distribution\n",
    "\n",
    "    # 将分词结果转化为索引表示\n",
    "    def convert_tokens_to_ids(self, drop_single_word=True):\n",
    "        self.token_ids = []\n",
    "        for sentence in self.tokens:\n",
    "            token_ids = [self.token2id.get(token, 1) for token in sentence]\n",
    "            # 忽略只有一个token的序列，无法计算loss\n",
    "            if len(token_ids) == 1 and drop_single_word:\n",
    "                continue\n",
    "            self.token_ids.append(token_ids)\n",
    "        \n",
    "        return self.token_ids\n",
    "\n",
    "dataset = TheLittlePrinceDataset()\n",
    "dataset.build_vocab(min_freq=1)\n",
    "sentences = dataset.convert_tokens_to_ids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "efc882de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(76044, 2) [[  4  16]\n",
      " [  4  19]\n",
      " [ 16   4]\n",
      " ...\n",
      " [130   3]\n",
      " [  3  86]\n",
      " [  3 130]]\n"
     ]
    }
   ],
   "source": [
    "# 遍历所有的中心词-上下文词对\n",
    "window_size = 2\n",
    "data = []\n",
    "\n",
    "for sentence in sentences:\n",
    "    for i in range(len(sentence)):\n",
    "        for j in range(i-window_size, i+window_size+1):\n",
    "            if j == i or j < 0 or j >= len(sentence):\n",
    "                continue\n",
    "            center_word = sentence[i]\n",
    "            context_word = sentence[j]\n",
    "            data.append([center_word, context_word])\n",
    "\n",
    "# 需要提前安装numpy\n",
    "import numpy as np\n",
    "data = np.array(data)\n",
    "print(data.shape, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "30903b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 需要提前安装PyTorch\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# 实现skipgram算法，使用对比学习计算损失\n",
    "class SkipGramNCE(nn.Module):\n",
    "    def __init__(self, vocab_size, embed_size, distribution,\\\n",
    "                 neg_samples=20):\n",
    "        super(SkipGramNCE, self).__init__()\n",
    "        print(f'vocab_size = {vocab_size}, embed_size = {embed_size}, '+\\\n",
    "              f'neg_samples = {neg_samples}')\n",
    "        self.input_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        self.output_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        distribution = np.power(distribution, 0.75)\n",
    "        distribution /= distribution.sum()\n",
    "        self.distribution = torch.tensor(distribution)\n",
    "        self.neg_samples = neg_samples\n",
    "        \n",
    "    def forward(self, input_ids, labels):\n",
    "        i_embed = self.input_embeddings(input_ids)\n",
    "        o_embed = self.output_embeddings(labels)\n",
    "        batch_size = i_embed.size(0)\n",
    "        n_words = torch.multinomial(self.distribution, batch_size * \\\n",
    "            self.neg_samples, replacement=True).view(batch_size, -1)\n",
    "        n_embed = self.output_embeddings(n_words)\n",
    "        pos_term = F.logsigmoid(torch.sum(i_embed * o_embed, dim=1))\n",
    "        # 负采样，用于对比学习\n",
    "        neg_term = F.logsigmoid(- torch.bmm(n_embed, \\\n",
    "            i_embed.unsqueeze(2)).squeeze())\n",
    "        neg_term = torch.sum(neg_term, dim=1)\n",
    "        loss = - torch.mean(pos_term + neg_term)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1d9da6c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00000000e+00 5.43983724e-02 5.34295679e-02 ... 9.68804495e-05\n",
      " 9.68804495e-05 9.68804495e-05]\n",
      "vocab_size = 1078, embed_size = 128, neg_samples = 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch-99, loss=2.7625: 100%|█| 100/100 [03:25<00:00,  2.05s/\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjMAAAGwCAYAAABcnuQpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABL+ElEQVR4nO3de1xUdf4/8NdcmOEOAgoioCh4y/Ba5iW1vKVmmtt20TXbrmam5pauufvLtk1aK7Oy7PLd1bZytfKSXTSxFDU1FVEJ74oCKqLcYWCGmfn8/hjmwHARmBnmzOjr+XjwKGbOwIejwov3+3NRCCEEiIiIiDyUUu4BEBERETmCYYaIiIg8GsMMEREReTSGGSIiIvJoDDNERETk0RhmiIiIyKMxzBAREZFHU8s9gJZmNptx6dIlBAQEQKFQyD0cIiIiagIhBEpKShAZGQml8vq1lxs+zFy6dAnR0dFyD4OIiIjskJWVhaioqOtec8OHmYCAAACWmxEYGCjzaIiIiKgpiouLER0dLf0cv54bPsxYW0uBgYEMM0RERB6mKVNEOAGYiIiIPBrDDBEREXk0hhkiIiLyaAwzRERE5NEYZoiIiMijMcwQERGRR2OYISIiIo/GMENEREQejWGGiIiIPBrDDBEREXk0hhkiIiLyaAwzRERE5NFu+IMmW0pJRSWKyivh46VCqL9W7uEQERHdtFiZsdNne85j8L+2462tJ+UeChER0U2NYcZOXirLrTMYhcwjISIiurkxzNhJXRVmKk1mmUdCRER0c2OYsZNGpQDAMENERCQ3hhk7eUmVGbaZiIiI5MQwYycvtpmIiIjcAsOMndRsMxEREbkFhhk7aViZISIicgsMM3binBkiIiL3wDBjJy81KzNERETugGHGTl5KzpkhIiJyBwwzdqquzLDNREREJCeGGTtxaTYREZF7YJixkxeXZhMREbkFhhk7cTUTERGRe2CYsZMUZoyszBAREcmJYcZOUpvJzDBDREQkJ4YZO2nYZiIiInILDDN2UleFGZNZwGRmoCEiIpILw4ydrG0mgCuaiIiI5MQwYyfrBGAAMLIyQ0REJBuGGTvVDDNc0URERCQfhhk7qZQKVB3PxDYTERGRjBhmHGCtzhgYZoiIiGTDMOMA6/JsI5dnExERyYZhxgHVJ2ezMkNERCQXhhkHqKsmzbDNREREJB+GGQfwsEkiIiL5Mcw4QKO2zplhZYaIiEguDDMOsO4CzDYTERGRfBhmHKBWss1EREQkN4YZB0irmbgDMBERkWwYZhygqWozcWk2ERGRfBhmHCCtZuJBk0RERLJhmHGAWsU2ExERkdwYZhzANhMREZH8GGYcUL1pHsMMERGRXBhmHMAdgImIiOTHMOMANdtMREREsmOYcYCGbSYiIiLZMcw4wNpmMrDNREREJBuGGQdYwwwPmiQiIpIPw4wDvNScM0NERCQ3hhkHePGgSSIiItkxzDiges4MKzNERERyYZhxgLXNxDkzRERE8mGYcYCGm+YRERHJjmHGAWqlpTLDNhMREZF8GGYc4KXmqdlERERyc5swk5iYCIVCgTlz5kiPCSGwaNEiREZGwsfHB8OGDUN6erp8g6xF2mfGzDYTERGRXNwizBw4cACffPIJEhISbB5fsmQJli5diuXLl+PAgQOIiIjAyJEjUVJSItNIbfE4AyIiIvnJHmZKS0sxZcoUfPrpp2jVqpX0uBACy5Ytw8KFCzFp0iT06NEDn332GXQ6HVavXt3gx9Pr9SguLrZ5aynWgyYNbDMRERHJRvYw89xzz2HcuHEYMWKEzeMZGRnIycnBqFGjpMe0Wi2GDh2KPXv2NPjxEhMTERQUJL1FR0e32Ni9WJkhIiKSnaxhZs2aNTh06BASExPrPJeTkwMACA8Pt3k8PDxceq4+CxYsQFFRkfSWlZXl3EHXoOGcGSIiItmp5frEWVlZmD17NrZu3Qpvb+8Gr1MoFDbvCyHqPFaTVquFVqt12jivR9oBmG0mIiIi2chWmUlJSUFubi769u0LtVoNtVqN5ORkvPfee1Cr1VJFpnYVJjc3t061Ri7WOTNsMxEREclHtjAzfPhwpKWl4fDhw9Jbv379MGXKFBw+fBgdO3ZEREQEkpKSpNcYDAYkJydj4MCBcg3bhhd3ACYiIpKdbG2mgIAA9OjRw+YxPz8/hIaGSo/PmTMHixcvRnx8POLj47F48WL4+vpi8uTJcgy5DmnODCszREREspEtzDTFvHnzUF5ejhkzZqCgoAD9+/fH1q1bERAQIPfQAFQfNGlgZYaIiEg2CiHEDf2TuLi4GEFBQSgqKkJgYKBTP/aZ3FKMWJqMIB8vHHllVOMvICIioiZpzs9v2feZ8WTcAZiIiEh+DDMOsLaZjGwzERERyYZhxgHSPjMmM27wbh0REZHbYphxgJey+vZxF2AiIiJ5MMw4wNpmAjhvhoiISC4MMw6wtpkAbpxHREQkF4YZB6iVrMwQERHJjWHGAQqFAl48n4mIiEhWDDMOks5nMrLNREREJAeGGQdJYcbMygwREZEcGGYc5MVdgImIiGTFMOMgac4M20xERESyYJhxUM1dgImIiMj1GGYcxNVMRERE8mKYcZC1MsPDJomIiOTBMOMgTgAmIiKSF8OMg6xtJs6ZISIikgfDjINYmSEiIpIXw4yDNGrOmSEiIpITw4yDrIdNss1EREQkD4YZB7HNREREJC+GGQd5qa0HTTLMEBERyYFhxkEa6z4zZs6ZISIikgPDjIM4Z4aIiEheDDMOqm4zsTJDREQkB4YZB2k4AZiIiEhWDDMOkg6aNDPMEBERyYFhxkFqFdtMREREcmKYcRD3mSEiIpIXw4yDNNY2E8MMERGRLBhmHFRdmWGbiYiISA4MMw5Ss81EREQkK4YZB7HNREREJC+GGQdxAjAREZG8GGYcxDkzRERE8mKYcZCabSYiIiJZMcw4iMcZEBERyYthxkHWNpOBbSYiIiJZMMw4yHpqtpGVGSIiIlkwzDjIi3NmiIiIZMUw4yCuZiIiIpIXw4yDpDkzRlZmiIiI5MAw4yBrm8loZpghIiKSA8OMgzRsMxEREcmKYcZB0kGTbDMRERHJgmHGQdY2k4GrmYiIiGTBMOMga5vJaGabiYiISA4MMw6yrmYymQVMDDREREQuxzDjIOtBkwA3ziMiIpIDw4yDrJUZgGGGiIhIDgwzDqoZZoxcnk1ERORyDDMOUikVUCl5PhMREZFcGGacQK3k8mwiIiK5MMw4AXcBJiIikg/DjBN4qav2mmFlhoiIyOUYZpyAuwATERHJh2HGCdRKtpmIiIjkwjDjBBq1NcywMkNERORqDDNOYG0z8eRsIiIi12OYcQLrxnmVPJuJiIjI5RhmnEBtDTOszBAREbkcw4wTaFTcAZiIiEguDDNOYG0zcWk2ERGR6zHMOIE1zPCgSSIiIteTNcysWLECCQkJCAwMRGBgIAYMGIDNmzdLzwshsGjRIkRGRsLHxwfDhg1Denq6jCOunxfbTERERLKRNcxERUXhjTfewMGDB3Hw4EHcfffdmDBhghRYlixZgqVLl2L58uU4cOAAIiIiMHLkSJSUlMg57Dqk1UwMM0RERC4na5gZP348xo4di86dO6Nz5854/fXX4e/vj3379kEIgWXLlmHhwoWYNGkSevTogc8++ww6nQ6rV6+Wc9h1VM+ZYZuJiIjI1dxmzozJZMKaNWtQVlaGAQMGICMjAzk5ORg1apR0jVarxdChQ7Fnz54GP45er0dxcbHNW0urnjPDygwREZGryR5m0tLS4O/vD61Wi+nTp2PDhg3o3r07cnJyAADh4eE214eHh0vP1ScxMRFBQUHSW3R0dIuOH+CcGSIiIjnJHma6dOmCw4cPY9++fXj22Wcxbdo0HDt2THpeoVDYXC+EqPNYTQsWLEBRUZH0lpWV1WJjt2KbiYiISD5quQeg0WgQFxcHAOjXrx8OHDiAd999F/PnzwcA5OTkoG3bttL1ubm5dao1NWm1Wmi12pYddC2cAExERCQf2SsztQkhoNfrERsbi4iICCQlJUnPGQwGJCcnY+DAgTKOsC4vtaVSxDkzREREridrZebll1/GmDFjEB0djZKSEqxZswY7duzAli1boFAoMGfOHCxevBjx8fGIj4/H4sWL4evri8mTJ8s57Dq8lNbKDNtMREREriZrmLly5QqmTp2Ky5cvIygoCAkJCdiyZQtGjhwJAJg3bx7Ky8sxY8YMFBQUoH///ti6dSsCAgLkHHYdPM6AiIhIPrKGmX//+9/XfV6hUGDRokVYtGiRawZkJ2ubiadmExERuZ7bzZnxRBrrPjNmtpmIiIhcjWHGCdRKS2WGbSYiIiLXY5hxAi911QRgtpmIiIhcjmHGCbjPDBERkXwYZpyAc2aIiIjkwzDjBOqqs5kMbDMRERG5HMOME7DNREREJB+GGSfQqLgDMBERkVwYZpyAlRkiIiL5MMw4gXXODMMMERGR6zHMOIEX20xERESyYZhxAg3bTERERLJhmHEC6aBJVmaIiIhcjmHGCdRKVmaIiIjkwjDjBGwzERERyYdhxgmq20wMM0RERK7GMOMENVczCcF5M0RERK7EMOMEXsrq28jDJomIiFyLYcYJrG0mgK0mIiIiV2OYcQJrmwkAKo2szBAREbkSw4wTqJU1KjNmVmaIiIhciWHGCRQKBZdnExERyYRhxkmkwybZZiIiInIphhknsc6bMbAyQ0RE5FIMM07ixTYTERGRLOwKM5999hl++OEH6f158+YhODgYAwcOxIULF5w2OE+iqWozGXnYJBERkUvZFWYWL14MHx8fAMDevXuxfPlyLFmyBGFhYXjhhRecOkBPoWabiYiISBZqe16UlZWFuLg4AMDGjRvxwAMP4Omnn8agQYMwbNgwZ47PY3ipeD4TERGRHOyqzPj7+yMvLw8AsHXrVowYMQIA4O3tjfLycueNzoNwzgwREZE87KrMjBw5Ek8++SR69+6NU6dOYdy4cQCA9PR0dOjQwZnj8xgatSXMcM4MERGRa9lVmfnggw8wYMAAXL16FevWrUNoaCgAICUlBY888ohTB+gprLsAc84MERGRa9lVmQkODsby5cvrPP7qq686PCBPxTYTERGRPOyqzGzZsgW7d++W3v/ggw/Qq1cvTJ48GQUFBU4bnCextpkYZoiIiFzLrjDz0ksvobi4GACQlpaGv/zlLxg7dizOnTuHuXPnOnWAnqK6MsM5M0RERK5kV5spIyMD3bt3BwCsW7cO9957LxYvXoxDhw5h7NixTh2gp7DOmWFlhoiIyLXsqsxoNBrodDoAwLZt2zBq1CgAQEhIiFSxudl4WdtMRoYZIiIiV7KrMjN48GDMnTsXgwYNwv79+7F27VoAwKlTpxAVFeXUAXoKDdtMREREsrCrMrN8+XKo1Wp88803WLFiBdq1awcA2Lx5M+655x6nDtBTSDsAm1mZISIiciW7KjMxMTH4/vvv6zz+zjvvODwgT2U9m6nSyMoMERGRK9kVZgDAZDJh48aNOH78OBQKBbp164YJEyZApVI5c3weQ8N9ZoiIiGRhV5g5c+YMxo4di4sXL6JLly4QQuDUqVOIjo7GDz/8gE6dOjl7nG6PB00SERHJw645M7NmzUKnTp2QlZWFQ4cOITU1FZmZmYiNjcWsWbOcPUaPwH1miIiI5GFXZSY5ORn79u1DSEiI9FhoaCjeeOMNDBo0yGmD8yRqtpmIiIhkYVdlRqvVoqSkpM7jpaWl0Gg0Dg/KE2nYZiIiIpKFXWHm3nvvxdNPP43ffvsNQggIIbBv3z5Mnz4d9913n7PH6BGsbSaemk1ERORadoWZ9957D506dcKAAQPg7e0Nb29vDBw4EHFxcVi2bJmTh+gZrGHGyDkzRERELmXXnJng4GB8++23OHPmDI4fPw4hBLp37464uDhnj89jcDUTERGRPJocZho7DXvHjh3S/y9dutTuAXkqL04AJiIikkWTw0xqamqTrlMoFHYPxpNVz5lhm4mIiMiVmhxmtm/f3pLj8HjWU7ONrMwQERG5lF0TgKkuLyXnzBAREcmBYcZJ2GYiIiKSB8OMk1jbTJVGVmaIiIhciWHGSaxLs41mhhkiIiJXYphxEh40SUREJA+GGSeR5sywzURERORSDDNOwh2AiYiI5MEw4yQa69lMZraZiIiIXIlhxknUKq5mIiIikgPDjJNY20wGtpmIiIhcimHGSTQ8aJKIiEgWDDNOYl3NZBaAifNmiIiIXIZhxknUqurTwlmdISIich2GGSexVmYAhhkiIiJXkjXMJCYm4rbbbkNAQADatGmDiRMn4uTJkzbXCCGwaNEiREZGwsfHB8OGDUN6erpMI26YbZhhm4mIiMhVZA0zycnJeO6557Bv3z4kJSXBaDRi1KhRKCsrk65ZsmQJli5diuXLl+PAgQOIiIjAyJEjUVJSIuPI61IpFVApq85nYmWGiIjIZdRyfvItW7bYvL9y5Uq0adMGKSkpGDJkCIQQWLZsGRYuXIhJkyYBAD777DOEh4dj9erVeOaZZ+QYdoPUSgVMZsHl2URERC7kVnNmioqKAAAhISEAgIyMDOTk5GDUqFHSNVqtFkOHDsWePXvq/Rh6vR7FxcU2b66i4WGTRERELuc2YUYIgblz52Lw4MHo0aMHACAnJwcAEB4ebnNteHi49FxtiYmJCAoKkt6io6NbduA1aL0st7PcYHLZ5yQiIrrZuU2YmTlzJo4ePYr//e9/dZ5TKBQ27wsh6jxmtWDBAhQVFUlvWVlZLTLe+oT5awEAV0oqmvW6Ur0RZu5NQ0REZBe3CDPPP/88Nm3ahO3btyMqKkp6PCIiAgDqVGFyc3PrVGustFotAgMDbd5cJTLYBwBwubDpYSa7QId+/0zC7LWHW2hURERENzZZw4wQAjNnzsT69evxyy+/IDY21ub52NhYREREICkpSXrMYDAgOTkZAwcOdPVwG9U2yBsAcLmovMmv2Z+Rj4pKM45kFbbQqIiIiG5ssq5meu6557B69Wp8++23CAgIkCowQUFB8PHxgUKhwJw5c7B48WLEx8cjPj4eixcvhq+vLyZPnizn0OtlrcxcakZl5nRuKQBAZzC2yJiIiIhudLKGmRUrVgAAhg0bZvP4ypUr8dhjjwEA5s2bh/LycsyYMQMFBQXo378/tm7dioCAABePtnGRwc2vzJypCjOleoYZIiIie8gaZoRofNKrQqHAokWLsGjRopYfkIPaBlXNmSlqemXmbFWYqag0w2QW0sZ7RERE1DRuMQH4RhEZZG0zlTcpqOmNJpzPq97tmK0mIiKi5mOYcaLwIMvSbL3RjAJdZaPXn7+mQ80V2WV67k9DRETUXAwzTqRVq6S9Zi4VNj5vxjpfxqqMlRkiIqJmY5hxsupJwI3Pmzmda3tYZhknARMRETUbw4yTNWevmTqVGbaZiIiImo1hxsnaBjV9r5naYYYTgImIiJqPYcbJmrrXjMkscO6aZSVTu6rN9rjXDBERUfMxzDiZtNdMI5WZrHwdDEYztGolukZYNgDU8bRtIiKiZmOYcTJrZeZSI5UZa4upY2t/BHhb9i7kBGAiIqLmY5hxMmtl5kpxBczmhjfOs57JFN/GH75aa5hhZYaIiKi5GGacrE2AFkoFUGkSuFaqb/A6a2Umro0//DQqAJwATEREZA+GGSdTq5QID7S0mi5eZ+O8M1drhJmqygwnABMRETUfw0wLqN5rpv5JwEII6YDJ+Db+8NNYwgwnABMRETUfw0wLiAyuPnCyPjnFFSjVG6FSKtA+1I+VGSIiIgcwzLQAa5hpqDJjnS/TPtQXGrUSflrOmSEiIrIXw0wLaOxIg9NXqltMAOCr4WomIiIiezHMtIDGjjSoOfkXgFSZ4T4zREREzccw0wIaO9LgjDT517LzLycAExER2Y9hpgVYKzO5JXpUmsx1nq+5xwxQozLDOTNERETNxjDTAkL9NNColBDCshNwTfllBuSXGQAAHVv7AYC0moltJiIiouZjmGkBSqUCEQ3sNWOtykS18pEm/lr/W2kSMBjrVnKIiIioYQwzLcS6oqn2XjOnc0sAVLeYAEjHGQCszhARETUXw0wLaWivGWm+TOvqMKNWKaFVW/4oOG+GiIioeRhmWoi010ztyox1j5lwf5vHrfNmuKKJiIioeRhmWkhb65EGNSozWfk67D2XBwDoFd3K5nrriiYeaUBERNQ8DDMtJLKeXYA/3nkWJrPAnfFh6BIRYHO9tNcMdwEmIiJqFoaZFmLda+Zy1S7AucUV+OpgNgDgubvi6lzPwyaJiIjswzDTQqy7AOeVGVBRacK/d2fAYDSjb/tW6B8bUud6Xw0PmyQiIrIHw0wLCfLxgo+XJaCczCnBF/suAACeu6sTFApFneutbaYyTgAmIiJqFoaZFqJQKNC2qjqTuPk4ygwmdI0IwF1d2tR7PXcBJiIisg/DTAtqV7Wiad+5fADAjLvi6q3KANWrmXQMM0RERM3CMNOCrHvNAECHUF+Mu7Vtg9dajzQo5WomIiKiZmGYaUHWFU0A8MzQTlAp66/KAIC/lhOAiYiI7MEw04KiWlnCTHigFpP6tLvutb6cAExERGQXtdwDuJGNS2iL1KxCTOgZCa1add1r/TkBmIiIyC4MMy3IV6PG4vtvbdq1VW0mhhkiIqLmYZvJTVTvM8MwQ0RE1BwMM25COjWbq5mIiIiahWHGTViPM2BlhoiIqHkYZtxE9Q7ArMwQERE1B8OMm7DuAFxmMEIIIfNoiIiIPAfDjJuwTgAWAiivZHWGiIioqRhm3ISPlwrWY5vYaiIiImo6hhk3oVQq4OvFIw2IiIiai2HGjfhqrYdNMswQERE1FcOMG7EeaaDj+UxERERNxjDjRqx7zbAyQ0RE1HQMM26EuwATERE1H8OMG/HjLsBERETNxjDjRnylXYAZZoiIiJqKYcaN+Gs4AZiIiKi5GGbciK+WE4CJiIiai2HGjViPNNAxzBARETUZw4wbkU7OrqfNdLmoHEXlla4eEhERkdtjmHEj0snZtSozhToD7n4rGQ9+tFeOYREREbk1hhk3Ym0z1a7MnMgpQXmlCadyS2AyCzmGRkRE5LYYZtyItTJTe85MVr4OACCEpUpDRERE1Rhm3Iivpv6DJrMKyqX/zy9jmCEiIqqJYcaN+DVw0GR2VWUGAPIYZoiIiGwwzLiRhiYAZxVUhxlWZoiIiGwxzLiR6gnAtefMVLeZWJkhIiKyxTDjRqxtpopKs7RqqaLShCslFdI1+aUMM0RERDUxzLgR36pTs4Hq6szFwnKIGqux88v0rh4WERGRW2OYcSNatRJqpQIAoNNbJgFn1Zj8C7DNREREVBvDjBtRKBRSdca6PNu6LFthyTgo4D4zRERENmQNMzt37sT48eMRGRkJhUKBjRs32jwvhMCiRYsQGRkJHx8fDBs2DOnp6fIM1kWql2dbwox1WXan1v4AgDzOmSEiIrIha5gpKytDz549sXz58nqfX7JkCZYuXYrly5fjwIEDiIiIwMiRI1FSUuLikbqOdNiktc1UtSy7Z1QwAC7NJiIiqk0t5ycfM2YMxowZU+9zQggsW7YMCxcuxKRJkwAAn332GcLDw7F69Wo888wz9b5Or9dDr6+eJFtcXOz8gbcgP43tXjPWZdm9ooOw7lA2CnQGCCGgsPadiIiIbnJuO2cmIyMDOTk5GDVqlPSYVqvF0KFDsWfPngZfl5iYiKCgIOktOjraFcN1GqkyY7DOmbFUZhKqKjOVJoHiCmO9ryUiIroZuW2YycnJAQCEh4fbPB4eHi49V58FCxagqKhIesvKymrRcTqb9XymMr0JJRWVKNRVAgDi2vhLVRu2moiIiKrJ2mZqitrtlMZaLFqtFlqttqWH1WKkk7MNRqnFFOKngZ9WjRB/Dcryy5FfpkdsmJ+cwyQiInIbbluZiYiIAIA6VZjc3Nw61ZobSc0JwJlVK5miW/kAAEJ8NQCA/LJKeQZHRETkhtw2zMTGxiIiIgJJSUnSYwaDAcnJyRg4cKCMI2tZ0gRggxHZVfNlokJ8AVgqNAB3ASYiIqpJ1jZTaWkpzpw5I72fkZGBw4cPIyQkBDExMZgzZw4WL16M+Ph4xMfHY/HixfD19cXkyZNlHHXLqq7MGJFVaVmeHSOFGUv7jLsAExERVZM1zBw8eBB33XWX9P7cuXMBANOmTcOqVaswb948lJeXY8aMGSgoKED//v2xdetWBAQEyDXkFiednK03SquWoltZwkyof1VlhhvnERERSWQNM8OGDYOoeYpiLQqFAosWLcKiRYtcNyiZ+WqtbSaTdC5TdEjVnBmpzcQwQ0REZOW2c2ZuVv412kzZVecyWSsz1jDDNhMREVE1hhk3Y91nJjNfh/JKExQKIDLYUpkJZWWGiIioDoYZN2NdzWStyrQN9IZGbfljamVHmLleG4+IiOhGwDDjZqyrmaysy7KB5ldm9p7NQ+/XkrAhNdt5AyQiInIzDDNuxroDsJV1vgxQPWemvNKEcoOp0Y+19VgOCnWV2JzW8PEPREREno5hxs3UrsxYVzIBlsnBGpXljyyvCRvnnbtaBgC4kKdz4giJiIjcC8OMm7FOALaKqdFmUigUzVqenXGtKszkl3HuDBER3bAYZtyMdQKwVXSNMAM0fXm23miSjkOoqDQjt4RHIBAR0Y2JYcbNqFVKaNXVfyw158wATd8FODNPB3ONYsz5qioNERHRjYZhxg1Z581o1Eq0CdDaPNfKt2ltprNXbcML580QEdGNimHGDflWtZqign2gVCpsnpPmzOiuH2YyalViLuSzMkNERDcmhhk3ZD3SIKrWfBmgxl4zjbSZzl0tBQAE+3oBAM6zMkNERDcohhk3ZK3MRLfyqfNciH/TJgBbKzND4lsDAC7ksTJDREQ3JoYZN2SdM1N7JRNQcxfg669OOlcVZu7u2gYAcOGajsuziYjohsQw44aGdWmDYF8vqapSU4ifZULw9SYAF+oM0vNDOreGQgGU6I31vqZUb8SC9WnYn5HvpNETERG5FsOMG3picCxS/z4S3SMD6zzXlH1mrFWZiEBvhPhp0DbQGwBwIb/uvJlvDmbhf/szMe+bI6zcEBGRR2KYcVMKhaLex61tppIKIwxGc73XWI8x6NjaDwAQE2ppV9U3byY1qxCAZYLwwQsFDo2ZiIhIDgwzHibIxwvW1doFDSzPzrhmWckUG2YJMx1CLf89f61uZeZwVZgBgK8OZDlxpERERK7BMONhlEpFoxvnVVdm/AEA7avCTO3KTH6ZwWYzvR/SLqNMb3T6mImIiFoSw4wHauywSeuy7I5SZaaqzVRrzsyR7EIAlgpOxzA/6Awm/JB2uSWGTERE1GIYZjzQ9SYBm82iOszUmTNjG2YOZxYCAHpHB+OBflEAgK8PstVERESehWHGA1UfNll3r5lLReXQG83wUinQLtiy6Z61zZRfZkBReaV0rXW+TK+YYPyhTxSUCuDA+QJp92AiIiJPwDDjga7XZrLOl2kf6ge1yvLH669VI8zfsj9NZlV1RgghtZl6RQcjPNAbQztb9rX5OiW7RcdPRETkTAwzHsi6cV59baba82WsqufNWJ6/kKdDoa4SGrUSXSMs+9k82C8aALD+UDaMpvqXfRMREbkbhhkPFFJ1eGT9lZmqZdmtbcNM9YomS2XG2mK6JTIQGrXlr8HwbuEI8dPgSrEeu05fa5GxExERORvDjAcK8W+4MmPd/bdTmL/N4+2rKjPnq56X5stEB0vXaNRKTOgVCQD4ihOBiYjIQzDMeCDrLsAF15kzU7cyY7uiKbWeMAMAf+xraTVtO34FefVMMCYiInI3DDMeqKEJwBWVJlwqKgdQ35yZql2A88qgN5pw/FIxAKB3dCub67pHBiIhKgiVJoHP911okfETERE5E8OMB5IqMzoDzObqwyHP55VBCCDQWy0FHitrmMkt0ePQhUIYTGaE+GkQHeJT5+M/PaQjAGDlr+dRyh2BiYjIzTHMeKBWVUHFLIDCGvvG1DzGoPZBlUG+Xgiumji86chFAEDPqKB6D7Qc06MtYsP8UFReidW/sTpDRETujWHGA3mplAj0VgMA8suq57XU3vm3tvYhlnkzP6blAAB61WoxWamUCjw7tBMA4NNdGaioNDln4ERERC2AYcZDSUcalFbPmzlbtSy79nwZK+vybOsuwL1ighv8+BN7t0NkkDeulujxDTfRIyIiN8Yw46HqmwRcXZnxr/c11o3zrHpGBTX48TVqpTR35qPks03aRO9kTgkuFpY3eh0REZEzqeUeANnHugvwd0cv4XB2Ia6W6HH8smWFUmwjlRnrNcG+mnqvs3rothi8/8sZZBeUY9ORS5jUJ6re6ypNZry19SQ+Tj4HtVKBP93RHrOGx9eZhKw3mrD79DW08tOgT0z9LS4iIqLmYpjxUK0DLGHGOv/FKsBbfZ0wU12Zqb2/TH18NCo8PjgWb/50Eh/uOIuJvdpBqbSdMJyVr8Pz/0uVNuEzmgVW7TmPdYeyMfOuOEwb2AFpF4uwIfUifjh6GUXllVApFVj/7ED0bMIYiIiIGsMw46Gm3tEe10r18PZSobW/Fm0CtWjtr0Wf9q3g7aWq9zU1KzNNCTMAMHVAe3yUfBZnckvx5f5M3BkXhiAfLwR4q7H12BXMX3cUJRVGBHqrseSBBPhrvfD6j8dx/HIxEjefwNtJp2AwVreoNGolDEYzXvjqMH6cdWeDYyUiImoqhRBCNH6Z5youLkZQUBCKiooQGBgo93BkJYRAz1e3orjCiE0zByEhKrhJr3vrp5NYvv1Mg8/3iQnGuw/3RnTVaimTWWBD6kW89dNJ5BRXwE+jwphb22JS73bo2jYQ9yzbidwSPf48qANeGX+LM740IiK6wTTn5zfDzE0m6dgVZObr8PigDvXuMVOfQp0Bz60+hIyrZSgqr0SZwbJUW6kApg/thBdGdoaXqu5c8nKDCaeulKBzeAB8NNUVmB0nc/HYygMAgNVP9sfAuLDrfv4zuaX45w/HMLZHWzx4W3RTv1QiopvSok3p+P1iET6e2hehVWf5eSKGmRoYZpzPaDKjpMIIlUqBQG8vuz7Gwg1p+PK3TEQGeWPLC0Ma/DgFZQZM/PBX6Uyp1yb2wNQ72te5bn9GPhZuSMOguDC8Mr57k4OaswghkH6pGPHh/tCq2TojInlsTc/B05+nAADG9IjAh1P6uPz7obM05+c3l2ZTs6lVSrTy09gdZADg5bHd0D7UF5eKKvDqpmP1XlNpMmPGl4dwIU8H36rKzt83/o6vDtie6P3VwSxM+b99OJ1bilV7zuPtrafsHpc9zl0txcOf7MO97+/Gs18cwg3++wER1ZCVr8O493ZhwfqjKJP5+BedwYhXv6v+frr59xxsOnJJxhG5DsMMycJPq8bbf+wJpQJYdygbH2w/A73RdqfhV79Lx95zefDTqLBhxiD8eVAHAMD89UexITUbJrPA4h+PY943R1FpEtLqqOXbz2DtgcwmjSMtuwiJm4/j873nkZZdZDNZuTEGoxnv/3wa97y7C79l5AMAfjmRi5+P5zb4GiEELhWWY3PaZSRuPo6HP9mLCR/8iu0nGn4NEbmvj5LPIv1SMf63Pwvj39+N9EtFso3l3Z9P42JhOaJa+eCZoZZ9wv7ft+nILa6wuc5sFvg4+SweX3VAWonaVKV6o1vuCs82E8nqzZ9O4IPtZwEAUa188OKoLrivZyS+/O0C/v5tOhQK4NOp/TCieziEEPj7t7/ji32ZUCosK7IOZRYCAGYNj8ec4fFYtu0U3vvlDFRKBVY+dhuGdG5d7+c1mQU+Sj6Ld5JOwVjjsE6NWokekYEY06MtnhgcW2cpulVqZgHmrzuKU1csuy4P6dwakUHeWHMgC+1DfbH1hSF12k1nr5biqf8elM7Qqu2JwbGYd08Xm9fpDEas/i0Te8/mYc6Izrj1OhsdNuZiYTneSTqF0bdEYGT3cLs/zs3m053nkJFXhqfv7IgODWx70JIyrpXBx0uFiCDvJr/mt3N5mPvVEUzoFYmXRndpUpuhotKEpUmn8Nu5PEzuH4NJfaLqnQvnTCazgFIBt2mDnLtaigt5OkSH+CI6xKfRlnFxRSXuWPwzdAYTgn29UKirhEatxN/GdcPUO9q79Os6kVOMe9/bDaNZ4D+P9cOd8a1x/4e/4veLxbi7axv8e1o/KBQKFFdUYu7aI9h2/AoAy9zHPw+KxV9GdYavpuEFzkIIfJ2SjVe+TUeInwZrnr5DWvTRUjhnpgaGGfdmNgusPZiFd5JOIbfEcs5U14gAnM4thcksMP+ernh2WCeb6xesT8Pag5ZWk0atxJsPJGBCr3YALP/g5n51BBtSL8Jfq8bX0wegW1vbP/dLheV4Ye1hqZoytHNrCABHsgqlox4AYEKvSLz5QE9o1Lbf0L8+mIWXN6Sh0iQQ4qfBK+O7476ekSgzmHD3WzuQW6LHvHu6YMawOOk1hToD7v9wDzKulUGlVKBrRAB6RgejV1Qwjl0uxqo95wEAPdoF4r2HeyMsQIvP917Av3dnSLs8B3ir8fkT/etdVp9fZsDW9ByM6B6OsHom/JXqjXhgxR6cyCmBQgG8PvFWTO4f05Q/IpfTGYzw8VJd9weB0WSGUqFoMGw6y4Hz+fjjR3sBQNoQ8vm74647qbLSZMaB8/k4eL4AvaKDGwzU13O1RI9NRy5hQ2o2fr9YDK1aiRdHdcHjg2OhauRr1hmMGPXOTmQXWHbjfmxgh0bnkR3KLMCLXx3BuWvVQTsmxBcz747D/b3bOT3UZBfo8NZPJ/HtkUuIDfPDg/2iMal3O7QJrA5sQgicz9PhZE4xbo0KRrtgnwY/1pItJxHoo8akPlHoHR1sV4g4klWIhz/Zh/KqqoNCAbQN9EanNv54aXSXeld/rvw1A69+dwydw/2x5ukBmPfNEWyrqsyOviUciZMS6mwe6qhfz1xDcXklhncLl743mc0CD368FwcvFOCeWyLw0dS+AIBTV0pw73u7YTCZseSBBPSKDsYzn6cg41oZNGolBnYKxY6TVwEA0SE+SLw/AYPj6y7IKNUb8bcNadh4uLpl1T7UF18/M8Dmz8zZGGZqYJjxDDqDEf/ZnYGPks+htKrvfH/vdlj6YM8635hMZoHXvj+G/Rn5eP3+Huhdazdhg9GMR//zG/ady0frAC3ujA9DiK8Grfw0UCoUWLHjDIorjPDVqLDovlvwx75RUCgU0jfPn49fwRubT8BoFhgcF4aPpvaFv1YNs1ngza0nsWKHpZJ0zy0RSJx0q3SKOQCsP5SNuV8dga9Ghe0vDkN4oDcqTWZM+89+7Dmbh3bBPtjw3EC0CbD9BrDt2BW89M0RFOgq4atRQaVUoKTCch/ah/oiyMcLR7OLEKBV479P3G7zNSefuooXvz6CqyV6xIT44osn+iOmxgaJZrPA05+nYNvxK9I+PwDw1zFdMX1oJ9R29mopfr9YhFNXSnAypxSnrpRAZzDhodui8OTgjjZf7/Wcu1qKC/k6eKtV0HopoVUrEejthahWPg3+sPnP7gy8/uNxdAkPwKzh8RjVPdwmsGTl67Ai+Sy+OZgNP60Kd3QMxYBOoRjYKRSd6jkt3hFCCExasQepmYVoE6CVwra/Vo3pQzuib/sQKBWAUqmAUgFk5uuw7Xgudp68ipIacyfG3doWr4zvXuebvtkskHaxCFkFOhTqKlFUbnk7daUEu05fg8lc91tz75hgvPlAT8S1qf/IEgB4/Ydj+HRXBoJ8vKRwPvWO9vjHhFvq3J+KShPe2XYKn+48B7MAwgO1+EOfKHx1MAvXqs59ax/qi/6xISgzmKDTG1FmMMFPo8JLo7uie2TzvqcWV1Tiw+1n8Z9fM+q0dFVKBYZ1bo2e0cE4ml2IQ5mFUpDXqpV4/u44PDWko1QtEUJg/aGLWLQp3eZ+d2zthz/0icKkPu3QNqj+AFTb+Wtl+MOKPcgrM6B1gFb6Oq0ig7yx7S9DbSoXZrPAiKXJOHetTFqYIITAyl/PI3HzcVSaBML8tUicdKtTKqEFZQa8+l26FCjC/LV45PZoTO4fg52nrmL+ujT4alTYNncoImsEv4+Sz+KNzSfgp1FBANAZTIgM8sZHU/siISoYO07mYuGG36WjaAZ0DMXtsSHo274VescE40KeZWNU6y9izw3rhA2HLyIrvxxdwgOw9pk7Gt1N3l4MMzUwzHiW/DIDPt55Fjq9CQvHdbN7U70iXSX+8NEenMktrff5nlFBePfh3g22DZJPXcWzX6RAZzChR7tArJjSF6//cBxb0i07Ls+8Kw5zR3auUxkwmwUe+GgPDmUWYlLvdnj7wZ54ecPv+N/+TPhpVPjm2YF1KkVWOUUVmLM2FfvOWSpG8W38MfPuOIy7tS30RjP+vOoA9mfkw1+rxmeP345bIgPxxuYTUlVHoQCEANoEaPH5E/3RJSIAALBkywl8uOMsNGol1j59B5KOXcGHVYHs2WGdMG90FxTqKvHt4Yv4OiUb6ZeKG7yvfhoVpg7ogKfujG2wOpFTVIE3fzqJ9anZqO+7y4hubbDkgZ42v7EKIbBs22m8+/Npm2u7RgRg9vB4dIkIwIodZ7Eh9aJNW7CmtkHeeHFUF0zq067JoaZMb4SXSlmn+gYAPxy9jOdWH4KvRoUdLw7D6dxSJG4+jt8vNnx/rML8NegZFYwdp67CZBYI8FZj/j1dMfn2GBy7XIzvjlzC90cvX/css57RwZjUux3uTWiLpGNX8M8fjqNUb4RGrcTckZ3x5OBYqGtVTNKyizDhg90wC+A/j/XDtRID5q8/CiGAKf1j8NqEHpbrLhZhx8mr+PbwRakaM6lPO7xy7y0I8vWCzmDEF/su4OPkc8ircf5bTSF+Gqx9+g7EhwfUeW7P2Wv4dOc5GM0CPl4q+GhU0KiU2Hb8Cgp0loA1oGMo/jKqM85dLcNXB7Nw8EJBnY+jUSvRNshbWs3YsbUf/jmxB7pFBGLhxjRpB/Q+McFoH+qHzb9fRkVldUiKDfND7+hg9I4JRq/oVujWNqDOPcsr1eMPK/bgfJ4OPdoFYs3TA+CnUSGvzIALeWWYs/YwsvLLMWNYJ8y7p6v0ul2nr2Lqv/cjQKvGvpeHw09bHXR+v1iEF9Yexumq7z+T+rTDK+NvQZBP9aKJkopKFFcYERnk3ejf1y2/5+BvG3/HtVI9lArLvbeGTZVSAS+VAhWVZvxtXDc8eWdHm9eazAJ/rPqeBAADO4Xi/Ud62/z7LdUb8eaWE/jvvgs2/2YVCkClUMBoFogM8sZ7j/RGvw4hyMzT4YGP9iC3RI9e0cH48sn+Nl+/szDM1MAwc/MqrqjE1vQryCvVI19nQEGZAQW6SvSOCcZTd3ZstHR+NLsQf155AHllBigVgFkAXioF3piUgD/0rf+cKuvrJnzwK4QAHuoXjbUHs2zm/lyPySyw/lA2gny8MKKbbVVCZzDi8VUHsO+cJdBEBHlLYW3agPZ4YnBHPP35QZzIKUGQjxdW/fk2ZObrMHvNYQDAsod6YWJvSzvu4+SzSNx8AgBwa7sgnMgpRqXJ8q3AS6VAQlQwukQEoEt4ADqHB6Co3ID3fzkjBR0fLxXG3Bph+e0tuhU6h/tDbzTj453n8OnOc1KpvmtEAExmAb3RDL3RhLxSA4xmgTYBWrzzUC8MiguD2Szwj++PSaFs1t1xMAmBz/ZckKp0NQ2OC8Nzd8VBo1Zi79lr2HsuDwfPF0Bf9Zt+/9gQvH5/D8S1qf4hK4TAhTwdDpzPx6krJTh1pRRncktxsbAc0SE+WP2kbf/fYDRj5DvJuJCnw+zh8XhhZGcAlrD63dFLWLXnPEoqjDALASEAs7AElqGdW2N4t3D0igqGUqlA+qUiLFifhqPZlkmh1nkVVn4aFW6JDEKwrxeCfLwQ7OuFMH8tRnQPR6daB8ZeKizHgvVpSD5laQvcHhuC9x7uLc2lMZrMmPDBr0i/VIzxPSPx/iO9AQDfpGTjpW+OQAjLD/0LeTqbgBLmr8Xi+3tg1C0Rde61zmDExtRLKNAZ4KdRwVerhq9GhU92nsPR7CK0CdDiq2cGSL8UCGE5zuSfPxyvt7IEAJ1a++Hlsd1wd9c2Nj/Ez14txbqUbFwsLMet7YLQp30r3BIZCI1KiU1HLuG1749JP8ADvNUoqTBCrVRgzoh4TB/aCWqVEiUVldicloNvDmVjf1UbuaYwfw0eui0aj9weg6hWvig3mPDIp/twOKsQUa18sH5G3app0rEreOq/B+GlUmDLnCHSn8tT/z2IpGNX8NjADlh0X93NP2tXvSICvTEoLgwX8spwPq9M+lpiw/wwpkcExt7aFrdEBkpV4ktFFTh1pQTrUrLx/dHLAIC4Nv5484EE9GgXhK3pV/DZ3vPS19mtbSC+mzmoTlgDgAt5ZZj3zVHcHhuC2cPj673G+mew52weUs7nIyWzAFn5lrA9sns43nwgwaYCc+pKCR78eC8KdZUYFBeKf0+7zek7ujPM1MAwQ444f60Mj/5nPzLzdWjl64WPp/bD7bEhjb5u3jdH8NXBbOn9l8d2xdND6rZ0mktnMOKJVQex91weAMsPojf/mIC7urQBYJmb8+dVB5CaWQhfjQpGs4DBaMb0oZ3w1zFdbT7W//Zn4uUNadJvYj3aBeKBPlGY0Ktdva0kIQR+Pp6Ld38+jbSLtis2fLxU8PZSSr9192vfCn+7t3ud+T3HLhXj+f8dwtmrZVAogGeGdMLVEj3WHbLcq1fvuwXTBnaQvpb/7M7Ayl/Po0RvxF1dWuP54fH1HlJaUWnCqj3nsWzbKVRUmuGlUuDpIR1xS2QQdp2+hl2nr0pzSOoT1coHa58ZIM3LsM6FCPPXIvmlYQ791mkyC/x373m89dNJlBlM0KiVGN61De7rGYm7urZp1g8AIQS+PpiNV79LR5nBhBA/Dd7+Y0/c1bWNFFCDfLywbe5Q6fw2wNL+fPHrI7DmC3+tGoPjwjCsS2uM6dEWQb7N22ahoMyARz7dhxM5JWgX7IO1z9yB1gFa/L+N6dJ8tom9IjGkc2uUV5pQbjBBbzQjMtgb4xMiG/xhej1F5ZV466eT+OI3S/WgU2s/LHuod4OT4gt1BhzOKkRqZmHVfwtQXNW6VSqAu7q0gcFkxq7T1xDs64V1zw6sEyAByz1/4rOD+OVELu6MD8N/H78d2QXlGPrmdpgFsG3u0Ou2/VIuFODFr48g41rdif8qpcIm9MWE+CLUX4MzV0ptWmfWDUpnDY+v8/flRE4xfj6eiwm9IhHVyrkTcnOLK1BUXom4NvW3cI9kFWLyp/tQZjDhvp6ReK8qQDsLw0wNDDPkqGulemxMvYjRt0Q0efb+1RI97n5rB0r0RvyxbxSWPJDgtPkc5QYT/t+3v0PAMu+l9oTfMr0R079Iwa7T1wBY2jofT+1X78TRHSdzkXKhAGNvbdtg+6s2IQT2nM3D3rN5SM0qwJGsIqmCEhPiiwVjuuKeHhENfr3lBhP+8f0x/G9/9fJ5lVKBNx9IqPdk9jK9EWUGY53fmOuTla/Dok3p+Lmepe5eKgV6RQfjlsggxIf7o3N4AFr5euHJzw7ifJ4OMSG+WPvMHfDVqDHsze0o0FXi9ft7YEr/ups02iO3pAInc0rQKzoYAQ7s0QRYVjjNXH1IqpRN6R+DdYeyUVFpmej5YL+6O2VvP5mL1MxCDOwUir7tWzk8qfdqiR4PfbIX566WoX2oL8L8tUi5UAClwrKP1BODY1tkNU9adhGOZBfiD32ibHYWb0ylyYxtx67gi98u4NczedLjWrUSq5/qj77tG/4l5UJeGUa+sxMGoxkrpvTBkewifJR8FnfGh+HzJ/o3+rnLDSas3p8Jnd6IDmF+iA3zQ/tQXygUCmw/kYsf0y5j+8lcmxaZWqlAx9Z+6BoRiCcGx7rtwbx7z+bh+f+lYsWf+uC2Do3/otccDDM1MMyQXPacvYbUzEI8eWesy3cF1htNWPzDcVwt1eNff0hw+Ifn9ZjMAmevliK3WI/bYls1+WvdnHYZ89cdRYXRjOWP9K63zWEPIQS2HruCd5JOwWQWGBwfhjvjw9A/NrTeCsvlonI89PE+ZObrEBvmh9s7hGDtwSzEtfHHltl32lVFcAW90YTEH6vnTAGW+RBfPtnfZUuCLxeV48GP90rtiABvNZZP7oOhdqzgcqWzV0vx5b5M7D2XhxdHdcbwbo1P0F269STe++UMIoO8UV5pQoGuEp9M7eu0v7c6gxG7Tl+DwWhGl4gAdAj1q3culzsqN5iaFSybimGmBoYZIvdVUlGJikqzTUtEDhcLy/HQx3ttWlH/92jjc5zcwZbfL2PeN0chAHw3c7DL98LJytdh2sr98FarsHxyb3Ssp1VzIyg3mDDynWTp70i7YB/snHdXo0vlyX4MMzUwzBBRU2Tl6/DwJ/twsbAc/WNDsObpO9xmM7fGlOmN0BvNTt/TpKnMZgGFG21+11JqnntUew8scr7m/Px2/loqIiIPFB3ii6+mD8C6lGw8fFu0R/1g9tOq4SdjcaulNy90FyO7h+OR22Pw+8UiPHJ73XlJJB9WZoiIiMjt8NRsIiIiumkwzBAREZFHY5ghIiIij8YwQ0RERB6NYYaIiIg8GsMMEREReTSGGSIiIvJoDDNERETk0RhmiIiIyKMxzBAREZFHY5ghIiIij+YRYebDDz9EbGwsvL290bdvX+zatUvuIREREZGbcPsws3btWsyZMwcLFy5Eamoq7rzzTowZMwaZmZlyD42IiIjcgNufmt2/f3/06dMHK1askB7r1q0bJk6ciMTExEZfz1OziYiIPE9zfn6rXTQmuxgMBqSkpOCvf/2rzeOjRo3Cnj176n2NXq+HXq+X3i8qKgJguSlERETkGaw/t5tSc3HrMHPt2jWYTCaEh4fbPB4eHo6cnJx6X5OYmIhXX321zuPR0dEtMkYiIiJqOSUlJQgKCrruNW4dZqwUCoXN+0KIOo9ZLViwAHPnzpXeN5vNyM/PR2hoaIOvsVdxcTGio6ORlZXFFlYL4712Hd5r1+G9dh3ea9dx1r0WQqCkpASRkZGNXuvWYSYsLAwqlapOFSY3N7dOtcZKq9VCq9XaPBYcHNxSQwQABAYG8h+Hi/Beuw7vtevwXrsO77XrOONeN1aRsXLr1UwajQZ9+/ZFUlKSzeNJSUkYOHCgTKMiIiIid+LWlRkAmDt3LqZOnYp+/fphwIAB+OSTT5CZmYnp06fLPTQiIiJyA24fZh566CHk5eXhH//4By5fvowePXrgxx9/RPv27eUeGrRaLV555ZU6bS1yPt5r1+G9dh3ea9fhvXYdOe612+8zQ0RERHQ9bj1nhoiIiKgxDDNERETk0RhmiIiIyKMxzBAREZFHY5ix04cffojY2Fh4e3ujb9++2LVrl9xD8niJiYm47bbbEBAQgDZt2mDixIk4efKkzTVCCCxatAiRkZHw8fHBsGHDkJ6eLtOIbxyJiYlQKBSYM2eO9BjvtfNcvHgRf/rTnxAaGgpfX1/06tULKSkp0vO8185hNBrxt7/9DbGxsfDx8UHHjh3xj3/8A2azWbqG99o+O3fuxPjx4xEZGQmFQoGNGzfaPN+U+6rX6/H8888jLCwMfn5+uO+++5Cdne2cAQpqtjVr1ggvLy/x6aefimPHjonZs2cLPz8/ceHCBbmH5tFGjx4tVq5cKX7//Xdx+PBhMW7cOBETEyNKS0ula9544w0REBAg1q1bJ9LS0sRDDz0k2rZtK4qLi2UcuWfbv3+/6NChg0hISBCzZ8+WHue9do78/HzRvn178dhjj4nffvtNZGRkiG3btokzZ85I1/BeO8c///lPERoaKr7//nuRkZEhvv76a+Hv7y+WLVsmXcN7bZ8ff/xRLFy4UKxbt04AEBs2bLB5vin3dfr06aJdu3YiKSlJHDp0SNx1112iZ8+ewmg0Ojw+hhk73H777WL69Ok2j3Xt2lX89a9/lWlEN6bc3FwBQCQnJwshhDCbzSIiIkK88cYb0jUVFRUiKChIfPTRR3IN06OVlJSI+Ph4kZSUJIYOHSqFGd5r55k/f74YPHhwg8/zXjvPuHHjxOOPP27z2KRJk8Sf/vQnIQTvtbPUDjNNua+FhYXCy8tLrFmzRrrm4sWLQqlUii1btjg8JraZmslgMCAlJQWjRo2yeXzUqFHYs2ePTKO6MRUVFQEAQkJCAAAZGRnIycmxufdarRZDhw7lvbfTc889h3HjxmHEiBE2j/NeO8+mTZvQr18//PGPf0SbNm3Qu3dvfPrpp9LzvNfOM3jwYPz88884deoUAODIkSPYvXs3xo4dC4D3uqU05b6mpKSgsrLS5prIyEj06NHDKffe7XcAdjfXrl2DyWSqc9BleHh4nQMxyX5CCMydOxeDBw9Gjx49AEC6v/Xd+wsXLrh8jJ5uzZo1OHToEA4cOFDnOd5r5zl37hxWrFiBuXPn4uWXX8b+/fsxa9YsaLVaPProo7zXTjR//nwUFRWha9euUKlUMJlMeP311/HII48A4N/rltKU+5qTkwONRoNWrVrVucYZPzsZZuykUChs3hdC1HmM7Ddz5kwcPXoUu3fvrvMc773jsrKyMHv2bGzduhXe3t4NXsd77Tiz2Yx+/fph8eLFAIDevXsjPT0dK1aswKOPPipdx3vtuLVr1+KLL77A6tWrccstt+Dw4cOYM2cOIiMjMW3aNOk63uuWYc99dda9Z5upmcLCwqBSqeokydzc3DqplOzz/PPPY9OmTdi+fTuioqKkxyMiIgCA994JUlJSkJubi759+0KtVkOtViM5ORnvvfce1Gq1dD95rx3Xtm1bdO/e3eaxbt26ITMzEwD/XjvTSy+9hL/+9a94+OGHceutt2Lq1Kl44YUXkJiYCID3uqU05b5GRETAYDCgoKCgwWscwTDTTBqNBn379kVSUpLN40lJSRg4cKBMo7oxCCEwc+ZMrF+/Hr/88gtiY2Ntno+NjUVERITNvTcYDEhOTua9b6bhw4cjLS0Nhw8flt769euHKVOm4PDhw+jYsSPvtZMMGjSozhYDp06dkg7L5d9r59HpdFAqbX+sqVQqaWk273XLaMp97du3L7y8vGyuuXz5Mn7//Xfn3HuHpxDfhKxLs//973+LY8eOiTlz5gg/Pz9x/vx5uYfm0Z599lkRFBQkduzYIS5fviy96XQ66Zo33nhDBAUFifXr14u0tDTxyCOPcFmlk9RczSQE77Wz7N+/X6jVavH666+L06dPiy+//FL4+vqKL774QrqG99o5pk2bJtq1ayctzV6/fr0ICwsT8+bNk67hvbZPSUmJSE1NFampqQKAWLp0qUhNTZW2JGnKfZ0+fbqIiooS27ZtE4cOHRJ33303l2bL7YMPPhDt27cXGo1G9OnTR1o+TPYDUO/bypUrpWvMZrN45ZVXREREhNBqtWLIkCEiLS1NvkHfQGqHGd5r5/nuu+9Ejx49hFarFV27dhWffPKJzfO8185RXFwsZs+eLWJiYoS3t7fo2LGjWLhwodDr9dI1vNf22b59e73fn6dNmyaEaNp9LS8vFzNnzhQhISHCx8dH3HvvvSIzM9Mp41MIIYTj9R0iIiIieXDODBEREXk0hhkiIiLyaAwzRERE5NEYZoiIiMijMcwQERGRR2OYISIiIo/GMENEREQejWGGiIiIPBrDDBE5TYcOHbBs2bImX79jxw4oFAoUFha22JjcSXPvDxE1jVruARCRfIYNG4ZevXo57QfsgQMH4Ofn1+TrBw4ciMuXLyMoKMgpn5+Ibk4MM0R0XUIImEwmqNWNf7to3bp1sz62RqNBRESEvUMjIgLANhPRTeuxxx5DcnIy3n33XSgUCigUCpw/f15q/fz000/o168ftFotdu3ahbNnz2LChAkIDw+Hv78/brvtNmzbts3mY9ZuoygUCvzf//0f7r//fvj6+iI+Ph6bNm2Snq/dZlq1ahWCg4Px008/oVu3bvD398c999yDy5cvS68xGo2YNWsWgoODERoaivnz52PatGmYOHHidb/ePXv2YMiQIfDx8UF0dDRmzZqFsrIym7G/9tprmDx5Mvz9/REZGYn333/f5mNkZmZiwoQJ8Pf3R2BgIB588EFcuXLF5ppNmzahX79+8Pb2RlhYGCZNmmTzvE6nw+OPP46AgADExMTgk08+ue64iahxDDNEN6l3330XAwYMwFNPPYXLly/j8uXLiI6Olp6fN28eEhMTcfz4cSQkJKC0tBRjx47Ftm3bkJqaitGjR2P8+PHIzMy87ud59dVX8eCDD+Lo0aMYO3YspkyZgvz8/Aav1+l0eOutt/D5559j586dyMzMxIsvvig9/69//QtffvklVq5ciV9//RXFxcXYuHHjdceQlpaG0aNHY9KkSTh69CjWrl2L3bt3Y+bMmTbXvfnmm0hISMChQ4ewYMECvPDCC0hKSgJgqVBNnDgR+fn5SE5ORlJSEs6ePYuHHnpIev0PP/yASZMmYdy4cUhNTcXPP/+Mfv362XyOt99+G/369UNqaipmzJiBZ599FidOnLju+ImoEU45e5uIPNLQoUPF7NmzbR7bvn27ACA2btzY6Ou7d+8u3n//fen99u3bi3feeUd6H4D429/+Jr1fWloqFAqF2Lx5s83nKigoEEIIsXLlSgFAnDlzRnrNBx98IMLDw6X3w8PDxZtvvim9bzQaRUxMjJgwYUKD45w6dap4+umnbR7btWuXUCqVory8XBr7PffcY3PNQw89JMaMGSOEEGLr1q1CpVKJzMxM6fn09HQBQOzfv18IIcSAAQPElClTGhxH+/btxZ/+9CfpfbPZLNq0aSNWrFjR4GuIqHGszBBRvWpXFMrKyjBv3jx0794dwcHB8Pf3x4kTJxqtzCQkJEj/7+fnh4CAAOTm5jZ4va+vLzp16iS937ZtW+n6oqIiXLlyBbfffrv0vEqlQt++fa87hpSUFKxatQr+/v7S2+jRo2E2m5GRkSFdN2DAAJvXDRgwAMePHwcAHD9+HNHR0TbVK+u9sF5z+PBhDB8+/LpjqXk/FAoFIiIirns/iKhxnABMRPWqvSrppZdewk8//YS33noLcXFx8PHxwQMPPACDwXDdj+Pl5WXzvkKhgNlsbtb1Qog6j9VU+/nazGYznnnmGcyaNavOczExMdd9rfVzCSHqfN7aj/v4+Fz3YwHNvx9E1DhWZohuYhqNBiaTqUnX7tq1C4899hjuv/9+3HrrrYiIiMD58+dbdoC1BAUFITw8HPv375ceM5lMSE1Nve7r+vTpg/T0dMTFxdV502g00nX79u2zed2+ffvQtWtXAJYqTGZmJrKysqTnjx07hqKiInTr1g2Apery888/O/x1ElHzsDJDdBPr0KEDfvvtN5w/fx7+/v4ICQlp8Nq4uDisX78e48ePh0KhwN///ndZKgrPP/88EhMTERcXh65du+L9999HQUFBvVUTq/nz5+OOO+7Ac889h6eeegp+fn44fvw4kpKSbFYs/frrr1iyZAkmTpyIpKQkfP311/jhhx8AACNGjEBCQgKmTJmCZcuWwWg0YsaMGRg6dKjUknvllVcwfPhwdOrUCQ8//DCMRiM2b96MefPmtexNIbrJsTJDdBN78cUXoVKp0L17d7Ru3fq681/eeecdtGrVCgMHDsT48eMxevRo9OnTx4WjtZg/fz4eeeQRPProoxgwYIA0/8Xb27vB1yQkJCA5ORmnT5/GnXfeid69e+Pvf/872rZta3PdX/7yF6SkpKB379547bXX8Pbbb2P06NEALO2gjRs3olWrVhgyZAhGjBiBjh07Yu3atdLrhw0bhq+//hqbNm1Cr169cPfdd+O3335rmRtBRBKFaKzZTETkxsxmM7p164YHH3wQr732mt0fp0OHDpgzZw7mzJnjvMERkUuwzUREHuXChQvYunUrhg4dCr1ej+XLlyMjIwOTJ0+We2hEJBO2mYjIoyiVSqxatQq33XYbBg0ahLS0NGzbtk2ahEtENx+2mYiIiMijsTJDREREHo1hhoiIiDwawwwRERF5NIYZIiIi8mgMM0REROTRGGaIiIjIozHMEBERkUdjmCEiIiKP9v8BevIjPC6hKMIAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 为对比学习负采样准备词频率分布\n",
    "vocab_size = len(dataset.token2id)\n",
    "embed_size = 128\n",
    "distribution = dataset.get_word_distribution()\n",
    "print(distribution)\n",
    "model = SkipGramNCE(vocab_size, embed_size, distribution)\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import SGD, Adam\n",
    "\n",
    "# 定义静态方法collate_batch批量处理数据，转化为PyTorch可以需要的张量类型\n",
    "class DataCollator:\n",
    "    @classmethod\n",
    "    def collate_batch(cls, batch):\n",
    "        batch = np.array(batch)\n",
    "        input_ids = torch.tensor(batch[:, 0], dtype=torch.long)\n",
    "        labels = torch.tensor(batch[:, 1], dtype=torch.long)\n",
    "        return {'input_ids': input_ids, 'labels': labels}\n",
    "\n",
    "# 定义训练参数以及训练循环\n",
    "epochs = 100\n",
    "batch_size = 128\n",
    "learning_rate = 1e-3\n",
    "epoch_loss = []\n",
    "\n",
    "data_collator = DataCollator()\n",
    "dataloader = DataLoader(data, batch_size=batch_size, shuffle=True,\\\n",
    "    collate_fn=data_collator.collate_batch)\n",
    "optimizer = Adam(model.parameters(), lr=learning_rate)\n",
    "model.zero_grad()\n",
    "model.train()\n",
    "\n",
    "# 需要提前安装tqdm\n",
    "from tqdm import trange\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# 训练过程，每步读取数据，送入模型计算损失，并使用PyTorch进行优化\n",
    "with trange(epochs, desc='epoch', ncols=60) as pbar:\n",
    "    for epoch in pbar:\n",
    "        for step, batch in enumerate(dataloader):\n",
    "            loss = model(**batch)\n",
    "            pbar.set_description(f'epoch-{epoch}, loss={loss.item():.4f}')\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            model.zero_grad()\n",
    "        epoch_loss.append(loss.item())\n",
    "    \n",
    "epoch_loss = np.array(epoch_loss)\n",
    "plt.plot(range(len(epoch_loss)), epoch_loss)\n",
    "plt.xlabel('training epoch')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9430e9a",
   "metadata": {},
   "source": [
    "TF-IDF加权\n",
    "\n",
    "定义词频率（term frequency）。注意到不同长度的文章词频率会有较大差距，不利于比较和运算，因此可以对词频率取对数。\n",
    "\n",
    "$$\\text{tf}_{t,d} = \\log (\\text{count}(t,d) + 1)$$\n",
    "\n",
    "其中$\\text{count}(t,d)$表示词$t$在文档$d$中出现的次数，为了避免对0取对数，把所有的计数加1。\n",
    "\n",
    "那么如何区分高频词与低频词呢？TF-IDF引入了另一个重要的评价指标——文档频率（document frequency），即一个词在语料库所包含的多少篇文档中出现。在所有文档里出现的词往往是虚词或是常见实词，而只在少量文档里出现的词往往是具有明确含义的实词并且具有很强的文档区分度。用$\\text{df}_t$来表示在多少篇文档中出现了词$t$。\n",
    "\n",
    "为了压低高频词和提升低频词的影响，TF-IDF使用文档频率的倒数，也就是逆向文档频率（inverse document frequency）来对词频率进行加权。这很好理解，一个词的文档频率越高，其倒数就越小，权重就越小。\n",
    "\n",
    "$$\\text{idf}_t = \\log \\frac{N}{\\text{df}_t}$$\n",
    "\n",
    "其中$N$表示文档总数。为了避免分母为0，通常会将分母改为$\\text{df}_t+1$。\n",
    "\n",
    "基于词频率和逆向文档频率，得到TF-IDF的最终值为：\n",
    "\n",
    "$$w_{t,d} = \\text{tf}_{t,d} \\times \\text{idf}_{t}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f765e353",
   "metadata": {},
   "source": [
    "很多情况下会额外对文档的TF-IDF向量使用L2归一化，使得不同文档的TF-IDF向量具有相同的模长，便于相互比较。\n",
    "下面给出了TF-IDF的代码实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9ce8e610",
   "metadata": {},
   "outputs": [],
   "source": [
    "class TFIDF:\n",
    "    def __init__(self, vocab_size, norm='l2', smooth_idf=True,\\\n",
    "                 sublinear_tf=True):\n",
    "        self.vocab_size = vocab_size\n",
    "        self.norm = norm\n",
    "        self.smooth_idf = smooth_idf\n",
    "        self.sublinear_tf = sublinear_tf\n",
    "    \n",
    "    def fit(self, X):\n",
    "        doc_freq = np.zeros(self.vocab_size, dtype=np.float64)\n",
    "        for data in X:\n",
    "            for token_id in set(data):\n",
    "                doc_freq[token_id] += 1\n",
    "        doc_freq += int(self.smooth_idf)\n",
    "        n_samples = len(X) + int(self.smooth_idf)\n",
    "        self.idf = np.log(n_samples / doc_freq) + 1\n",
    "    \n",
    "    def transform(self, X):\n",
    "        assert hasattr(self, 'idf')\n",
    "        term_freq = np.zeros((len(X), self.vocab_size), dtype=np.float64)\n",
    "        for i, data in enumerate(X):\n",
    "            for token in data:\n",
    "                term_freq[i, token] += 1\n",
    "        if self.sublinear_tf:\n",
    "            term_freq = np.log(term_freq + 1)\n",
    "        Y = term_freq * self.idf\n",
    "        if self.norm:\n",
    "            row_norm = (Y**2).sum(axis=1)\n",
    "            row_norm[row_norm == 0] = 1\n",
    "            Y /= np.sqrt(row_norm)[:, None]\n",
    "        return Y\n",
    "    \n",
    "    def fit_transform(self, X):\n",
    "        self.fit(X)\n",
    "        return self.transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9379c8b3-33b8-46af-a935-4f09eb35eb4d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
